mkdir -p results
# Make the blastdbs - they are output in same folder as the fasta input
conda activate MCHelper
while IFS=',' read -r species strain marta dean; do
makeblastdb -in ${marta} -dbtype "nucl"
done < <(sed '1d' curated_libs.csv)
# Run BLASTS - reciprocal not needed
while IFS=',' read -r species strain marta dean; do
blastn -query ${dean} -db ${marta} -outfmt 6 -max_hsps 1 -out results/${species}_${strain}_dean_vs_marta.blast.out
done < <(sed '1d' curated_libs.csv)Manual curation trials
Comparing results of different TEammo curators
Run BLASTs between curated libraries
Visualisation
if(!require("BiocManager", character.only = T)) install.packages("BiocManager")
pkgs.list <- readLines("r-requirements.txt")
for (i in 1:length(pkgs.list)){
if(!require(pkgs.list[i], character.only = T)){
BiocManager::install(pkgs.list[i], Ncpus = ceiling(parallel::detectCores()/1.7))
require(pkgs.list[i], character.only = TRUE)
}else{require(pkgs.list[1],character.only = TRUE)}
}Load and overall comparisons
setwd("/home/csic/gcy/jgp/extra_storage/dean/mctrials/mctrials")
source("scripts/mccompare_functions.R")
# Color schemes
palette_seq_match <- c(
"Missing from query lib" = "black", # Clear and bold
"Missing from subject lib" = "grey50", # Distinct but neutral
"Present, 70" = RColorBrewer::brewer.pal(3, "Blues")[2], # Medium blue
"Present, 80" = RColorBrewer::brewer.pal(3, "Blues")[3], # Darker blue
"Perfect, 95-100" = RColorBrewer::brewer.pal(3, "Greens")[3] # Bright green
)
# Load the TE classifications
teClassification <- read.table("orozco_classification-2024_mchelper.csv", sep = ";", header = TRUE)
teClassification <- teClassification %>%
mutate(AppName = paste(Class, Order, Superfamily, sep ="/") %>% sub("/$", "", .) %>% sub("/$", "", .))
teClassification <- rbind(
teClassification
, c("", "Unclassified", "", "Unclassified")
)
# Load libraries to compare
# Drosophila birchii
lib_Dbir_marta <- teReadLib(
"../../data/final_libraries/D.birchii/D.birchii.TE_library.fasta",
libIdentifier = "Marta",
species = "D.birchii",
strain = "v1.0_00_GenBank")
lib_Dbir_dean <- teReadLib(
"data/MCH_output/D.birchii/v1.0_00_GenBank/D.birchii_v1.0_00_GenBank_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.birchii",
strain = "v1.0_00_GenBank")
# Drosophila merina
lib_Dmer_marta <- teReadLib(
"../../data/final_libraries/D.merina/D.merina.TE_library.fasta",
libIdentifier = "Marta",
species = "D.merina", strain = "NA")
lib_Dmer_dean <- teReadLib(
"data/MCH_output/D.merina/NA/D.merina_NA_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.merina", strain = "NA")
# Drosophila planitibia
lib_Dpla_marta <- teReadLib(
"../../data/final_libraries/D.planitibia/D.planitibia.TE_library.fasta",
libIdentifier = "Marta",
species = "D.planitibia", strain = "NA2")
lib_Dpla_dean <- teReadLib(
"data/MCH_output/D.planitibia/NA2/D.planitibia_NA2_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.planitibia", strain = "NA2")
# Drosophila santomea
lib_Dsan_marta <- teReadLib(
"../../data/final_libraries/D.santomea/D.santomea.TE_library.fasta",
libIdentifier = "Marta",
species = "D.santomea",
strain = "STO_CAGO_1482_RefSeq")
lib_Dsan_dean <- teReadLib(
"data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/D.santomea_STO_CAGO_1482_RefSeq_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.santomea",
strain = "STO_CAGO_1482_RefSeq")
# Drosophila tristis
lib_Dtri_marta <- teReadLib(
"../../data/final_libraries/D.tristis/D.tristis.TE_library.fasta",
libIdentifier = "Marta",
species = "D.tristis", strain = "nanopore_D2")
lib_Dtri_dean <- teReadLib(
"data/MCH_output/D.tristis/nanopore_D2/D.tristis_nanopore_D2_curated-TE-library.fa",
libIdentifier = "Dean",
species = "D.tristis", strain = "nanopore_D2")
# The raw libraries
lib_Dbir_RM2_raw <- teReadLib(
"data/RM2_output/D.birchii/v1.0_00_GenBank/D.birchii_v1.0_00_GenBank-families.fa",
libIdentifier = "RM2_raw",
species = "D.birchii", strain = "v1.0_00_GenBank")
lib_Dmer_RM2_raw <- teReadLib(
"data/RM2_output/D.merina/NA/D.merina-families.fa",
libIdentifier = "RM2_raw",
species = "D.merina", strain = "NA")
lib_Dpla_RM2_raw <- teReadLib(
"data/RM2_output/D.planitibia/NA2/D.planitibia-families.fa",
libIdentifier = "RM2_raw",
species = "D.planitibia", strain = "NA2")
lib_Dsan_RM2_raw <- teReadLib(
"data/RM2_output/D.santomea/STO_CAGO_1482_RefSeq/D.santomea_STO_CAGO_1482_RefSeq-families.fa",
libIdentifier = "RM2_raw",
species = "D.santomea", strain = "STO_CAGO_1482_RefSeq")
lib_Dtri_RM2_raw <- teReadLib(
"data/RM2_output/D.tristis/nanopore_D2/D.tristis_nanopore_D2-families.fa",
libIdentifier = "RM2_raw",
species = "D.tristis", strain = "nanopore_D2")
# MCH auto libraries
lib_Dbir_MCH_auto <- teReadLib(
"data/MCH_output/D.birchii/v1.0_00_GenBank/curated_sequences_NR.fa",
libIdentifier = "MCH_auto",
species = "D.birchii", strain = "v1.0_00_GenBank")
lib_Dmer_MCH_auto <- teReadLib(
"data/MCH_output/D.merina/NA/curated_sequences_NR.fa",
libIdentifier = "MCH_auto",
species = "D.merina", strain = "NA")
lib_Dpla_MCH_auto <- teReadLib(
"data/MCH_output/D.planitibia/NA2/curated_sequences_NR.fa",
libIdentifier = "MCH_auto",
species = "D.planitibia", strain = "NA2")
lib_Dsan_MCH_auto <- teReadLib(
"data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/curated_sequences_NR.fa",
libIdentifier = "MCH_auto",
species = "D.santomea", strain = "STO_CAGO_1482_RefSeq")
lib_Dtri_MCH_auto <- teReadLib(
"data/MCH_output/D.tristis/nanopore_D2/curated_sequences_NR.fa",
libIdentifier = "MCH_auto",
species = "D.tristis", strain = "nanopore_D2")# Load the blast results comparing curated libraries
blast_Dbir_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.birchii_v1.0_00_GenBank_dean_vs_marta.blast.out",
blast_query_lib = lib_Dbir_dean,
blast_subject_lib = lib_Dbir_marta,
species = "D.birchii",
strain = "v1.0_00_GenBank",
comparison = "dean_vs_marta")
blast_Dmer_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.merina_NA_dean_vs_marta.blast.out",
blast_query_lib = lib_Dmer_dean,
blast_subject_lib = lib_Dmer_marta,
species = "D.merina",
strain = "NA",
comparison = "dean_vs_marta")
blast_Dpla_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.planitibia_NA2_dean_vs_marta.blast.out",
blast_query_lib = lib_Dpla_dean,
blast_subject_lib = lib_Dpla_marta,
species = "D.planitibia",
strain = "NA2",
comparison = "dean_vs_marta")
blast_Dsan_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.santomea_STO_CAGO_1482_RefSeq_dean_vs_marta.blast.out",
blast_query_lib = lib_Dsan_dean,
blast_subject_lib = lib_Dsan_marta,
species = "D.santomea",
strain = "STO_CAGO_1482_RefSeq",
comparison = "dean_vs_marta")
blast_Dtri_dean_vs_marta <- LoadBlastComparison(
blast_out = "results/D.tristis_nanopore_D2_dean_vs_marta.blast.out",
blast_query_lib = lib_Dtri_dean,
blast_subject_lib = lib_Dtri_marta,
species = "D.tristis",
strain = "nanopore_D2",
comparison = "dean_vs_marta")
# Bring them all together for overall
blast_all <- bind_rows(
blast_Dbir_dean_vs_marta,
blast_Dmer_dean_vs_marta,
blast_Dpla_dean_vs_marta,
blast_Dsan_dean_vs_marta,
blast_Dtri_dean_vs_marta
)How do the libraries compare overall?
lib_list <- list(
Dbir_RM2raw = lib_Dbir_RM2_raw,
Dmer_RM2raw = lib_Dmer_RM2_raw,
Dpla_RM2raw = lib_Dpla_RM2_raw,
Dsan_RM2raw = lib_Dsan_RM2_raw,
Dtri_RM2raw = lib_Dtri_RM2_raw
)
libs_RM2_raw_size <- summarize_libraries(lib_list)
lib_list <- list(
Dbir_MCHauto = lib_Dbir_MCH_auto,
Dmer_MCHauto = lib_Dmer_MCH_auto,
Dpla_MCHauto = lib_Dpla_MCH_auto,
Dsan_MCHauto = lib_Dsan_MCH_auto,
Dtri_MCHauto = lib_Dtri_MCH_auto
)
libs_MCH_auto_size <- summarize_libraries(lib_list)
lib_list <- list(
Dbir_dean = lib_Dbir_dean,
Dmer_dean = lib_Dmer_dean,
Dpla_dean = lib_Dpla_dean,
Dsan_dean = lib_Dsan_dean,
Dtri_dean = lib_Dtri_dean
)
libs_final_curated_dean_size <- summarize_libraries(lib_list)
lib_list <- list(
Dbir_marta = lib_Dbir_marta,
Dmer_marta = lib_Dmer_marta,
Dpla_marta = lib_Dpla_marta,
Dsan_marta = lib_Dsan_marta,
Dtri_marta = lib_Dtri_marta
)
libs_final_curated_marta_size <- summarize_libraries(lib_list)
lib_sizes <- bind_rows(
libs_RM2_raw_size,
libs_MCH_auto_size,
libs_final_curated_dean_size,
libs_final_curated_marta_size
)
lib_sizes %>%
ggplot(aes(x = species, y = length)) +
theme_bw() +
geom_bar(aes(fill = what), stat = "identity", position = "dodge") +
scale_fill_discrete(name = "Library type") +
theme(axis.text.x = element_text(angle = 90))tePlotLib(list(lib_Dbir_dean, lib_Dbir_marta))tePlotLib(list(lib_Dmer_dean, lib_Dmer_marta))tePlotLib(list(lib_Dpla_dean, lib_Dpla_marta))tePlotLib(list(lib_Dsan_dean, lib_Dsan_marta))tePlotLib(list(lib_Dtri_dean, lib_Dtri_marta))How often do the curators agree based on BLASTn sequence identity between their final curated libraries?
PlotBlastBarMatches(blast_all)PlotBlastBarMatches(blast_Dbir_dean_vs_marta)PlotBlastBarMatches(blast_Dmer_dean_vs_marta)PlotBlastBarMatches(blast_Dpla_dean_vs_marta)PlotBlastBarMatches(blast_Dsan_dean_vs_marta)PlotBlastBarMatches(blast_Dtri_dean_vs_marta)How often do curators agree on TE classifications?
PlotBlastTileMatches(blast_all)PlotBlastTileMatches(blast_Dbir_dean_vs_marta)PlotBlastTileMatches(blast_Dmer_dean_vs_marta)PlotBlastTileMatches(blast_Dpla_dean_vs_marta)PlotBlastTileMatches(blast_Dsan_dean_vs_marta)PlotBlastTileMatches(blast_Dtri_dean_vs_marta)TE size distribution
lib1 <- width(lib_Dtri_dean) %>%
data.frame(length = .)
lib1$curator <- "dean"
lib2 <- width(lib_Dtri_marta) %>%
data.frame(length = .)
lib2$curator <- "marta"
bind_rows(lib1, lib2) %>%
ggplot(aes(x = length)) +
geom_density(aes(fill = curator), alpha = 0.5, color = "black") +
labs(title = "Density Plot of DNA Sequence Lengths",
x = "Sequence Length (bp)",
y = "Density") +
theme_minimal()